{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Phase 2: Train the single models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "########################################\n",
    "## import packages\n",
    "########################################\n",
    "import os\n",
    "import re\n",
    "import csv\n",
    "import codecs\n",
    "import numpy as np\n",
    "np.random.seed(1337)\n",
    "\n",
    "import pandas as pd\n",
    "import operator\n",
    "import sys\n",
    "\n",
    "import tensorflow as tf\n",
    "from keras.backend.tensorflow_backend import set_session\n",
    "\n",
    "from string import punctuation\n",
    "from keras.preprocessing.text import Tokenizer\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "from keras.utils import to_categorical\n",
    "from iwillwin.trainer.supervised_trainer import KerasModelTrainer\n",
    "from iwillwin.data_utils.data_helpers import DataTransformer, DataLoader, CharDataTransformer\n",
    "from iwillwin.model.sim_zoos import *\n",
    "import tensorflow as tf\n",
    "from iwillwin.config import dataset_config, model_config\n",
    "import keras.backend as K\n",
    "\n",
    "from sklearn.metrics import roc_auc_score, log_loss\n",
    "from keras.callbacks import EarlyStopping, ModelCheckpoint\n",
    "from sklearn.metrics import log_loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "NB_WORDS = 100000\n",
    "EMBEDDING_DIM = 300\n",
    "MAX_SEQUENCE_LENGTH = 30\n",
    "OUT_SIZE = 1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and prepare the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_transformer = DataTransformer(max_num_words=NB_WORDS, max_sequence_length=MAX_SEQUENCE_LENGTH, char_level=False,\n",
    "                                   normalization=True, features_processed=True)\n",
    "trains, tests, labels = data_transformer.prepare_data(dual=False)\n",
    "print(\"Number of unique words\", len(data_transformer.tokenizer.index_docs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from keras.utils import to_categorical\n",
    "from sklearn.utils import class_weight\n",
    "labels = to_categorical(labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare word embedding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Embeddings\")\n",
    "print(os.listdir(\"../data/wordvec\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_loader = DataLoader()\n",
    "sgns_bigram_embedding = data_loader.load_embedding('../data/wordvec/sgns.merge.bigram')\n",
    "tencent_ai_embedding = data_loader.load_embedding('../data/wordvec/Tencent_AILab_ChineseEmbedding.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def build_embedding_matrix(embeddings_index, embedding_size, nb_words=NB_WORDS, word_index=data_transformer.tokenizer.word_index,):\n",
    "    #nb_words = min(nb_words, len(embeddings_index))\n",
    "    embedding_matrix = np.random.rand(nb_words, embedding_size)\n",
    "    word_index = data_transformer.tokenizer.word_index\n",
    "    null_words = open('null-word.txt', 'w', encoding='utf-8')\n",
    "    null_ctr = 0\n",
    "    for word, i in word_index.items():\n",
    "        if i >= nb_words:\n",
    "            null_words.write(word + '\\n')\n",
    "            continue\n",
    "        embedding_vector = embeddings_index.get(word)\n",
    "        if embedding_vector is not None:\n",
    "            embedding_matrix[i] = embedding_vector\n",
    "        else:\n",
    "            print(word)\n",
    "            null_ctr += 1\n",
    "            null_words.write(word + '\\n')\n",
    "    print('Null word embeddings: %d' % null_ctr)\n",
    "    return embedding_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sgns_bigram_matrix = build_embedding_matrix(sgns_bigram_embedding, embedding_size=300)\n",
    "tencent_ai_matrix = build_embedding_matrix(tencent_ai_embedding, embedding_size=200)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Trick Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train_df = pd.read_csv('../data/dataset/train.csv')\n",
    "test_df = pd.read_csv('../data/dataset/test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rumor_words_list = ['辟谣', '谣言', '谣传', '传谣', '澄清', '真相', '假新闻', '传言', '造谣', '假消息', '不实', '勿传', '假的', '子虚乌有', '诈骗', '骗局', '以讹传讹']\n",
    "\n",
    "def is_rumor(text):\n",
    "    if type(text) != str:\n",
    "        print(text, type(text))\n",
    "        return 0\n",
    "    energy = 0\n",
    "    for rumor_word in rumor_words_list:\n",
    "        if rumor_word in text:\n",
    "            energy = 1\n",
    "    return energy\n",
    "\n",
    "def has_split_symbol(text):\n",
    "    if type(text) != str:\n",
    "        return 0\n",
    "    if '|' in text:\n",
    "        return 1\n",
    "    return 0\n",
    "\n",
    "for df in [train_df, test_df]:\n",
    "    df['has_|'] = df['title2_zh'].apply(has_split_symbol)\n",
    "    df['has_rumor_words'] = df['title2_zh'].apply(is_rumor)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train_has_rumor = train_df.has_rumor_words.values\n",
    "test_has_rumor = test_df.has_rumor_words.values\n",
    "\n",
    "trick_trains_features = np.concatenate((trains[2], train_has_rumor.reshape((-1, 1))), axis=1)\n",
    "trick_tests_features = np.concatenate((tests[2], test_has_rumor.reshape((-1, 1))), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def _build_exact_match_sequences(sent_1, sent_2):\n",
    "    sent_1_char_set = set(sent_1)\n",
    "    sent_2_char_set = set(sent_2)\n",
    "    intersection = sent_1_char_set & sent_2_char_set\n",
    "    \n",
    "    sent_1_em = np.zeros_like(sent_1)\n",
    "    sent_2_em = np.zeros_like(sent_2)\n",
    "\n",
    "    for i in range(len(sent_1)):\n",
    "        if sent_1[i] == 0:\n",
    "            continue\n",
    "        if sent_1[i] in intersection:\n",
    "            sent_1_em[i] = 1\n",
    "    \n",
    "    for i in range(len(sent_2)):\n",
    "        if sent_2[i] == 0:\n",
    "            continue        \n",
    "        if sent_2[i] in intersection:\n",
    "            sent_2_em[i] = 1\n",
    "    \n",
    "    return sent_1_em, sent_2_em\n",
    "\n",
    "def build_exact_match_sequences(sents_1, sents_2):\n",
    "    sents_1_em, sents_2_em = [], []\n",
    "    for sent_1, sent_2 in zip(sents_1, sents_2):\n",
    "        sent_1_em, sent_2_em = _build_exact_match_sequences(sent_1, sent_2)\n",
    "        sents_1_em.append(sent_1_em)\n",
    "        sents_2_em.append(sent_2_em)\n",
    "    return np.array(sents_1_em), np.array(sents_2_em)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "trains_1_ems, trains_2_ems = build_exact_match_sequences(trains[0], trains[1])\n",
    "tests_1_ems, tests_2_ems = build_exact_match_sequences(tests[0], tests[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Shape of train em\", trains_1_ems.shape, trains_2_ems.shape)\n",
    "print(\"Shape of test em\", tests_1_ems.shape, tests_2_ems.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "em_train_features = (trains_1_ems, trains_2_ems)\n",
    "em_test_features = (tests_1_ems, tests_2_ems)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tricks ?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "use_tricky = True\n",
    "\n",
    "if use_tricky:\n",
    "    trains = (trains[0], trains[1], trick_trains_features)\n",
    "    tests = (tests[0], tests[1], trick_tests_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "import torch.nn.functional as F\n",
    "import importlib\n",
    "\n",
    "from sklearn.metrics import roc_auc_score, log_loss\n",
    "from keras.callbacks import EarlyStopping, ModelCheckpoint\n",
    "\n",
    "from iwillwin.config import model_config\n",
    "\n",
    "class ModelTrainer(object):\n",
    "\n",
    "    def __init__(self, model_stamp, epoch_num, learning_rate=1e-3,\n",
    "                 shuffle_inputs=False, verbose_round=40, early_stopping_round=8):\n",
    "        self.models = []\n",
    "        self.model_stamp = model_stamp\n",
    "        self.val_loss = -1\n",
    "        self.auc = -1\n",
    "        self.epoch_num = epoch_num\n",
    "        self.learning_rate = learning_rate\n",
    "        self.eps = 1e-10\n",
    "        self.verbose_round = verbose_round\n",
    "        self.early_stopping_round = early_stopping_round\n",
    "        self.shuffle_inputs = shuffle_inputs\n",
    "\n",
    "    def train_folds(self, X, y, fold_count, em_train_features, batch_size, get_model_func, augments=None, skip_fold=0, patience=10, scale_sample_weight=False,\n",
    "                    class_weight=None, self_aware=False, swap_input=False):\n",
    "        X1, X2, features, = X\n",
    "        em1, em2 = em_train_features\n",
    "        features = features\n",
    "        weight_val=scale_sample_weight\n",
    "\n",
    "        fold_size = len(X1) // fold_count\n",
    "        models = []\n",
    "        fold_predictions = []\n",
    "        score = 0\n",
    "\n",
    "        for fold_id in range(0, fold_count):\n",
    "            fold_start = fold_size * fold_id\n",
    "            fold_end = fold_start + fold_size\n",
    "\n",
    "            if fold_id == fold_count - 1:\n",
    "                fold_end = len(X1)\n",
    "\n",
    "            train_x1 = np.concatenate([X1[:fold_start], X1[fold_end:]])\n",
    "            train_x2 = np.concatenate([X2[:fold_start], X2[fold_end:]])\n",
    "            train_features = np.concatenate([features[:fold_start], features[fold_end:]])\n",
    "            \n",
    "            train_em_1 = np.concatenate([em1[:fold_start], em1[fold_end:]])\n",
    "            train_em_2 = np.concatenate([em2[:fold_start], em2[fold_end:]])\n",
    "            \n",
    "            train_y = np.concatenate([y[:fold_start], y[fold_end:]])\n",
    "            \n",
    "            val_x1 = X1[fold_start:fold_end]\n",
    "            val_x2 = X2[fold_start:fold_end]\n",
    "            val_features = features[fold_start:fold_end]\n",
    "            val_em1 = em1[fold_start:fold_end]\n",
    "            val_em2 = em2[fold_start:fold_end]\n",
    "            val_y = y[fold_start:fold_end]\n",
    "\n",
    "            fold_pos = (np.sum(train_y) / len(train_x1))\n",
    "\n",
    "            train_data = {\n",
    "                \"first_sentences\": train_x1,\n",
    "                \"second_sentences\": train_x2,\n",
    "                \"mata-features\": train_features,\n",
    "                \"first_exact_match\": train_em_1,\n",
    "                \"second_exact_match\": train_em_2,\n",
    "            }\n",
    "\n",
    "            val_data = {\n",
    "                \"first_sentences\": val_x1,\n",
    "                \"second_sentences\": val_x2,\n",
    "                \"mata-features\": val_features,\n",
    "                \"first_exact_match\": val_em1,\n",
    "                \"second_exact_match\": val_em2,\n",
    "            }\n",
    "\n",
    "            model, bst_val_score, fold_prediction = self._train_model_by_logloss(\n",
    "                get_model_func(), batch_size, train_data, train_y, val_data, val_y, fold_id, patience, class_weight, weight_val=weight_val)\n",
    "    \n",
    "            score += bst_val_score\n",
    "            models.append(model)\n",
    "            fold_predictions.append(fold_prediction)\n",
    "\n",
    "        self.models = models\n",
    "        self.val_loss = score / fold_count\n",
    "        return models, self.val_loss, fold_predictions\n",
    "\n",
    "    def _train_model_by_logloss(self, model, batch_size, train_x, train_y, val_x, val_y, fold_id, patience):\n",
    "        # return a list which holds [models, val_loss, auc, prediction]\n",
    "        raise NotImplementedError\n",
    "\n",
    "class KerasModelTrainer(ModelTrainer):\n",
    "\n",
    "    def __init__(self, *args, **kwargs):\n",
    "        super(KerasModelTrainer, self).__init__(*args, **kwargs)\n",
    "        pass\n",
    "\n",
    "    def _train_model_by_logloss(self, model, batch_size, train_x, train_y, val_x, val_y, fold_id, patience, class_weight, weight_val):\n",
    "        early_stopping = EarlyStopping(monitor='val_loss', patience=patience)\n",
    "        bst_model_path = self.model_stamp + str(fold_id) + '.h5'\n",
    "        #val_data = (val_x, val_y, weight_val) if weight_val is not None else (val_x, val_y)\n",
    "        val_data =  (val_x, val_y)\n",
    "        model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)\n",
    "        hist = model.fit(train_x, train_y,\n",
    "                         validation_data=val_data,\n",
    "                         epochs=self.epoch_num, batch_size=batch_size, shuffle=True,\n",
    "                         class_weight=class_weight,\n",
    "                         callbacks=[early_stopping, model_checkpoint],)\n",
    "        bst_val_score = max(hist.history['val_loss'])\n",
    "        model.load_weights(bst_model_path)\n",
    "        predictions = model.predict(val_x)\n",
    "\n",
    "        return model, bst_val_score, predictions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Path Setting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "oofs_dir = \"../data/oofs/\"\n",
    "output_dir = \"../data/output/\"\n",
    "onehot_pred_dir = \"../data/one_hot_pred/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def make_predictions():\n",
    "    # REFACTOR: remove this lazy solution\n",
    "    test_predicts_list = []\n",
    "    oofs_predictions = []\n",
    "    pre_trained_models = []\n",
    "    \n",
    "    oofs_path = oofs_dir + model_submit_prefix\n",
    "    output_path = output_dir + model_submit_prefix\n",
    "    one_hot_pred_path = onehot_pred_dir + \"One-Hot\" + model_submit_prefix\n",
    "\n",
    "    print(\"Predicting training results...\")\n",
    "    train_predicts = np.concatenate(folds_preds, axis=0)\n",
    "    oofs = pd.DataFrame({\"unrelated\": train_predicts[:, 0], \"agreed\": train_predicts[:, 1], \"disagreed\": train_predicts[:, 2]})\n",
    "    score = numpy_weighted_accuracy(labels, oofs[['unrelated', 'agreed', 'disagreed']].values)\n",
    "    submit_path = oofs_path + \"-Train-L{:4f}-NB{:d}.csv\".format(score, NB_WORDS)\n",
    "    oofs.to_csv(submit_path, index=False)\n",
    "\n",
    "\n",
    "    print(\"Predicting testing results...\")\n",
    "    test_predicts_list = []\n",
    "    for fold_id, model in enumerate(models):\n",
    "        test_predicts = model.predict({\"first_sentences\":tests[0],\n",
    "                                       \"second_sentences\":tests[1],\n",
    "                                       \"mata-features\":tests[2],\n",
    "                                       \"first_exact_match\": tests_1_ems,\n",
    "                                       \"second_exact_match\": tests_2_ems,\n",
    "                                      }, batch_size=128, verbose=1)\n",
    "\n",
    "        test_predicts_list.append(test_predicts)\n",
    "\n",
    "    test_predicts = np.zeros(test_predicts_list[0].shape)\n",
    "    for fold_predict in test_predicts_list:\n",
    "        test_predicts += fold_predict\n",
    "    test_predicts /= len(test_predicts_list)\n",
    "\n",
    "    test_predicts = pd.DataFrame({\"unrelated\": test_predicts[:, 0], \"agreed\": test_predicts[:, 1], \"disagreed\": test_predicts[:, 2]})\n",
    "    submit_path = output_path + \"-L{:4f}-NB{:d}.csv\".format(score, NB_WORDS)\n",
    "    test_predicts.to_csv(submit_path, index=False) # 0.3343\n",
    "\n",
    "    print(\"Predicting labeled testing results...\")\n",
    "    ids = pd.read_csv(\"../data/dataset/test.csv\")\n",
    "    pred_labels = test_predicts.idxmax(axis=1)\n",
    "    sub = pd.DataFrame({\"Id\": ids['id'].values, \"Category\": pred_labels})\n",
    "    submit_path = one_hot_pred_path + \"-L{:4f}-NB{:d}.csv\".format(score, NB_WORDS)\n",
    "    sub.to_csv(submit_path, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DenseCNN"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TenCent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fold_count = 8\n",
    "embedding_matrix = tencent_ai_matrix\n",
    "EMBEDDING_DIM = 200\n",
    "\n",
    "models_checkpoints_path = \"WordTC-DenseCNN5Layers-NoMeta-3P-NoEM-NoClassWeighted-3Layers\"\n",
    "model_submit_prefix = \"WordSGNS-DenseCNN5Layers-NoMeta-3P-NoEM-NoClassWeighted-3Layers\"\n",
    "\n",
    "def _agent_get_model():\n",
    "    return get_dense_cnn(NB_WORDS, EMBEDDING_DIM, embedding_matrix, MAX_SEQUENCE_LENGTH, OUT_SIZE)\n",
    "\n",
    "trainer = KerasModelTrainer(model_stamp=models_checkpoints_path, epoch_num=500)\n",
    "models, score, folds_preds = trainer.train_folds(X=trains, y=labels, augments=None, fold_count=fold_count, batch_size=1024,\n",
    "    em_train_features=em_train_features, get_model_func=_agent_get_model, patience=20)\n",
    "\n",
    "print(\"score\", score)\n",
    "make_predictions()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SGNS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fold_count = 8\n",
    "embedding_matrix = sgns_bigram_matrix\n",
    "EMBEDDING_DIM = 300\n",
    "\n",
    "models_checkpoints_path = \"WordSGNS-DenseCNN5Layers-NoMeta-3P-NoEM-NoClassWeighted-3Layers\"\n",
    "model_submit_prefix = \"WordSGNS-DenseCNN5Layers-NoMeta-3P-NoEM-NoClassWeighted-3Layers\"\n",
    "\n",
    "def _agent_get_model():\n",
    "    return get_dense_cnn(NB_WORDS, EMBEDDING_DIM, embedding_matrix, MAX_SEQUENCE_LENGTH, OUT_SIZE)\n",
    "    return model_func(NB_WORDS, EMBEDDING_DIM, embedding_matrix, MAX_SEQUENCE_LENGTH, OUT_SIZE)\n",
    "\n",
    "trainer = KerasModelTrainer(model_stamp=models_checkpoints_path, epoch_num=500)\n",
    "models, score, folds_preds = trainer.train_folds(X=trains, y=labels, augments=None, fold_count=fold_count, batch_size=1024,\n",
    "    em_train_features=em_train_features, get_model_func=_agent_get_model, patience=20)\n",
    "\n",
    "print(\"score\", score)\n",
    "make_predictions()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Multiwindow-Gated-CNN"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TenCent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fold_count = 8\n",
    "embedding_matrix = tencent_ai_matrix\n",
    "EMBEDDING_DIM = 200\n",
    "\n",
    "models_checkpoints_path = \"WordTC-Gated4GWindows-NoMeta-3P-NoClassWeighted-3Layers-NoEM\"\n",
    "model_submit_prefix = \"WordTC-Gated4GWindows-NoMeta-3P-NoEM-NoClassWeighted-3Layers-NoEM\"\n",
    "\n",
    "def _agent_get_model():\n",
    "    return get_multiwindow_cnn(NB_WORDS, EMBEDDING_DIM, embedding_matrix, MAX_SEQUENCE_LENGTH, OUT_SIZE)\n",
    "\n",
    "trainer = KerasModelTrainer(model_stamp=models_checkpoints_path, epoch_num=500)\n",
    "models, score, folds_preds = trainer.train_folds(X=trains, y=labels, augments=None, fold_count=fold_count, batch_size=128,\n",
    "    em_train_features=em_train_features, get_model_func=_agent_get_model, patience=20)\n",
    "\n",
    "print(\"score\", score)\n",
    "make_predictions()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SGNS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "fold_count = 8\n",
    "embedding_matrix = sgns_bigram_matrix\n",
    "EMBEDDING_DIM = 300\n",
    "\n",
    "models_checkpoints_path = \"SGNS-Gated4GWindows-NoMeta-3P-NoClassWeighted-3Layers-NoEM\"\n",
    "model_submit_prefix = \"SGNS-Gated4GWindows-NoMeta-3P-NoEM-NoClassWeighted-3Layers-NoEM\"\n",
    "\n",
    "def _agent_get_model():\n",
    "    return get_multiwindow_cnn(NB_WORDS, EMBEDDING_DIM, embedding_matrix, MAX_SEQUENCE_LENGTH, OUT_SIZE)\n",
    "\n",
    "trainer = KerasModelTrainer(model_stamp=models_checkpoints_path, epoch_num=500)\n",
    "models, score, folds_preds = trainer.train_folds(X=trains, y=labels, augments=None, fold_count=fold_count, batch_size=128,\n",
    "    em_train_features=em_train_features, get_model_func=_agent_get_model, patience=20)\n",
    "\n",
    "print(\"score\", score)\n",
    "make_predictions()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ESIM\n",
    "## Tencent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "fold_count = 8\n",
    "embedding_matrix = tencent_ai_matrix\n",
    "EMBEDDING_DIM = 200\n",
    "\n",
    "models_checkpoints_path = \"WordTC-ESIM-NoMeta-3P-NoEM-NoClassWeighted-3Layers\"\n",
    "model_submit_prefix = \"WordTC-ESIM-NoMeta-3P-NoEM-NoClassWeighted-3Layers\"\n",
    "\n",
    "def _agent_get_model():\n",
    "    return get_ESIM(NB_WORDS, EMBEDDING_DIM, embedding_matrix, MAX_SEQUENCE_LENGTH, OUT_SIZE, lr=4e-4)\n",
    "\n",
    "trainer = KerasModelTrainer(model_stamp=models_checkpoints_path, epoch_num=500)\n",
    "models, score, folds_preds = trainer.train_folds(X=trains, y=labels, augments=None, fold_count=fold_count, batch_size=64,\n",
    "    em_train_features=em_train_features, get_model_func=_agent_get_model, patience=7)\n",
    "\n",
    "print(\"score\", score)\n",
    "make_predictions()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## SGNS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fold_count = 8\n",
    "embedding_matrix = sgns_bigram_matrix\n",
    "EMBEDDING_DIM = 300\n",
    "\n",
    "models_checkpoints_path = \"WordSGNS-ESIM-NoMeta-3P-NoEM-NoClassWeighted-3Layers\"\n",
    "model_submit_prefix = \"WordSGNS-ESIM-NoMeta-3P-NoEM-NoClassWeighted-3Layers\"\n",
    "\n",
    "def _agent_get_model():\n",
    "    return get_ESIM(NB_WORDS, EMBEDDING_DIM, embedding_matrix, MAX_SEQUENCE_LENGTH, OUT_SIZE)\n",
    "\n",
    "trainer = KerasModelTrainer(model_stamp=models_checkpoints_path, epoch_num=500)\n",
    "models, score, folds_preds = trainer.train_folds(X=trains, y=labels, augments=None, fold_count=fold_count, batch_size=64,\n",
    "    em_train_features=em_train_features, get_model_func=_agent_get_model, patience=10)\n",
    "\n",
    "print(\"score\", score)\n",
    "make_predictions()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DenseRNN"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TenCent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "fold_count = 8\n",
    "embedding_matrix = tencent_ai_matrix\n",
    "EMBEDDING_DIM = 200\n",
    "\n",
    "models_checkpoints_path = \"WordTC-DenseRNN-NoMeta-3P-NoEM-NoClassWeighted-3Layers\"\n",
    "model_submit_prefix = \"WordTC-DenseRNN-NoMeta-3P-NoEM-NoClassWeighted-3Layers\"\n",
    "\n",
    "def _agent_get_model():\n",
    "    return get_darnn(NB_WORDS, EMBEDDING_DIM, embedding_matrix, MAX_SEQUENCE_LENGTH, OUT_SIZE)\n",
    "\n",
    "trainer = KerasModelTrainer(model_stamp=models_checkpoints_path, epoch_num=500)\n",
    "models, score, folds_preds = trainer.train_folds(X=trains, y=labels, augments=None, fold_count=fold_count, batch_size=1024,\n",
    "    em_train_features=em_train_features, get_model_func=_agent_get_model, patience=20)\n",
    "\n",
    "print(\"score\", score)\n",
    "make_predictions()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SGNS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fold_count = 8\n",
    "embedding_matrix = sgns_bigram_matrix\n",
    "EMBEDDING_DIM = 300\n",
    "\n",
    "models_checkpoints_path = \"WordSGNS-DenseRNN-NoMeta-3P-NoEM-NoClassWeighted-3Layers\"\n",
    "model_submit_prefix = \"WordSGNS-DenseRNN-NoMeta-3P-NoEM-NoClassWeighted-3Layers\"\n",
    "\n",
    "def _agent_get_model():\n",
    "    return get_darnn(NB_WORDS, EMBEDDING_DIM, embedding_matrix, MAX_SEQUENCE_LENGTH, OUT_SIZE)\n",
    "\n",
    "trainer = KerasModelTrainer(model_stamp=models_checkpoints_path, epoch_num=500)\n",
    "models, score, folds_preds = trainer.train_folds(X=trains, y=labels, augments=None, fold_count=fold_count, batch_size=1024,\n",
    "    em_train_features=em_train_features, get_model_func=_agent_get_model, patience=20)\n",
    "\n",
    "print(\"score\", score)\n",
    "make_predictions()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {},
    "version_major": 1,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
